import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
sd = pd.read_csv("Sales Data.csv")
sd
| Unnamed: 0 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 295665 | Macbook Pro Laptop | 1 | 1700.00 | 2019-12-30 00:01:00 | 136 Church St, New York City, NY 10001 | 12 | 1700.00 | New York City | 0 |
| 1 | 1 | 295666 | LG Washing Machine | 1 | 600.00 | 2019-12-29 07:03:00 | 562 2nd St, New York City, NY 10001 | 12 | 600.00 | New York City | 7 |
| 2 | 2 | 295667 | USB-C Charging Cable | 1 | 11.95 | 2019-12-12 18:21:00 | 277 Main St, New York City, NY 10001 | 12 | 11.95 | New York City | 18 |
| 3 | 3 | 295668 | 27in FHD Monitor | 1 | 149.99 | 2019-12-22 15:13:00 | 410 6th St, San Francisco, CA 94016 | 12 | 149.99 | San Francisco | 15 |
| 4 | 4 | 295669 | USB-C Charging Cable | 1 | 11.95 | 2019-12-18 12:38:00 | 43 Hill St, Atlanta, GA 30301 | 12 | 11.95 | Atlanta | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 185945 | 13617 | 222905 | AAA Batteries (4-pack) | 1 | 2.99 | 2019-06-07 19:02:00 | 795 Pine St, Boston, MA 02215 | 6 | 2.99 | Boston | 19 |
| 185946 | 13618 | 222906 | 27in FHD Monitor | 1 | 149.99 | 2019-06-01 19:29:00 | 495 North St, New York City, NY 10001 | 6 | 149.99 | New York City | 19 |
| 185947 | 13619 | 222907 | USB-C Charging Cable | 1 | 11.95 | 2019-06-22 18:57:00 | 319 Ridge St, San Francisco, CA 94016 | 6 | 11.95 | San Francisco | 18 |
| 185948 | 13620 | 222908 | USB-C Charging Cable | 1 | 11.95 | 2019-06-26 18:35:00 | 916 Main St, San Francisco, CA 94016 | 6 | 11.95 | San Francisco | 18 |
| 185949 | 13621 | 222909 | AAA Batteries (4-pack) | 1 | 2.99 | 2019-06-25 14:33:00 | 209 11th St, Atlanta, GA 30301 | 6 | 2.99 | Atlanta | 14 |
185950 rows × 11 columns
sd.head()
| Unnamed: 0 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 295665 | Macbook Pro Laptop | 1 | 1700.00 | 2019-12-30 00:01:00 | 136 Church St, New York City, NY 10001 | 12 | 1700.00 | New York City | 0 |
| 1 | 1 | 295666 | LG Washing Machine | 1 | 600.00 | 2019-12-29 07:03:00 | 562 2nd St, New York City, NY 10001 | 12 | 600.00 | New York City | 7 |
| 2 | 2 | 295667 | USB-C Charging Cable | 1 | 11.95 | 2019-12-12 18:21:00 | 277 Main St, New York City, NY 10001 | 12 | 11.95 | New York City | 18 |
| 3 | 3 | 295668 | 27in FHD Monitor | 1 | 149.99 | 2019-12-22 15:13:00 | 410 6th St, San Francisco, CA 94016 | 12 | 149.99 | San Francisco | 15 |
| 4 | 4 | 295669 | USB-C Charging Cable | 1 | 11.95 | 2019-12-18 12:38:00 | 43 Hill St, Atlanta, GA 30301 | 12 | 11.95 | Atlanta | 12 |
sd.describe()
| Unnamed: 0 | Order ID | Quantity Ordered | Price Each | Month | Sales | Hour | |
|---|---|---|---|---|---|---|---|
| count | 185950.000000 | 185950.000000 | 185950.000000 | 185950.000000 | 185950.000000 | 185950.000000 | 185950.000000 |
| mean | 8340.388475 | 230417.569379 | 1.124383 | 184.399735 | 7.059140 | 185.490917 | 14.413305 |
| std | 5450.554093 | 51512.737110 | 0.442793 | 332.731330 | 3.502996 | 332.919771 | 5.423416 |
| min | 0.000000 | 141234.000000 | 1.000000 | 2.990000 | 1.000000 | 2.990000 | 0.000000 |
| 25% | 3894.000000 | 185831.250000 | 1.000000 | 11.950000 | 4.000000 | 11.950000 | 11.000000 |
| 50% | 7786.000000 | 230367.500000 | 1.000000 | 14.950000 | 7.000000 | 14.950000 | 15.000000 |
| 75% | 11872.000000 | 275035.750000 | 1.000000 | 150.000000 | 10.000000 | 150.000000 | 19.000000 |
| max | 25116.000000 | 319670.000000 | 9.000000 | 1700.000000 | 12.000000 | 3400.000000 | 23.000000 |
sd.columns
Index(['Unnamed: 0', 'Order ID', 'Product', 'Quantity Ordered', 'Price Each',
'Order Date', 'Purchase Address', 'Month', 'Sales', 'City', 'Hour'],
dtype='object')
sd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 185950 entries, 0 to 185949 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 185950 non-null int64 1 Order ID 185950 non-null int64 2 Product 185950 non-null object 3 Quantity Ordered 185950 non-null int64 4 Price Each 185950 non-null float64 5 Order Date 185950 non-null object 6 Purchase Address 185950 non-null object 7 Month 185950 non-null int64 8 Sales 185950 non-null float64 9 City 185950 non-null object 10 Hour 185950 non-null int64 dtypes: float64(2), int64(5), object(4) memory usage: 15.6+ MB
sd.count()
Unnamed: 0 185950 Order ID 185950 Product 185950 Quantity Ordered 185950 Price Each 185950 Order Date 185950 Purchase Address 185950 Month 185950 Sales 185950 City 185950 Hour 185950 dtype: int64
sd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 185950 entries, 0 to 185949 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 185950 non-null int64 1 Order ID 185950 non-null int64 2 Product 185950 non-null object 3 Quantity Ordered 185950 non-null int64 4 Price Each 185950 non-null float64 5 Order Date 185950 non-null object 6 Purchase Address 185950 non-null object 7 Month 185950 non-null int64 8 Sales 185950 non-null float64 9 City 185950 non-null object 10 Hour 185950 non-null int64 dtypes: float64(2), int64(5), object(4) memory usage: 15.6+ MB
sd['Order Date'] = pd.to_datetime(sd['Order Date'])
sd['Order Date']
0 2019-12-30 00:01:00
1 2019-12-29 07:03:00
2 2019-12-12 18:21:00
3 2019-12-22 15:13:00
4 2019-12-18 12:38:00
...
185945 2019-06-07 19:02:00
185946 2019-06-01 19:29:00
185947 2019-06-22 18:57:00
185948 2019-06-26 18:35:00
185949 2019-06-25 14:33:00
Name: Order Date, Length: 185950, dtype: datetime64[ns]
sd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 185950 entries, 0 to 185949 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 185950 non-null int64 1 Order ID 185950 non-null int64 2 Product 185950 non-null object 3 Quantity Ordered 185950 non-null int64 4 Price Each 185950 non-null float64 5 Order Date 185950 non-null datetime64[ns] 6 Purchase Address 185950 non-null object 7 Month 185950 non-null int64 8 Sales 185950 non-null float64 9 City 185950 non-null object 10 Hour 185950 non-null int64 dtypes: datetime64[ns](1), float64(2), int64(5), object(3) memory usage: 15.6+ MB
sd.isnull().sum()
Unnamed: 0 0 Order ID 0 Product 0 Quantity Ordered 0 Price Each 0 Order Date 0 Purchase Address 0 Month 0 Sales 0 City 0 Hour 0 dtype: int64
sd['Product'].unique()
array(['Macbook Pro Laptop', 'LG Washing Machine', 'USB-C Charging Cable',
'27in FHD Monitor', 'AA Batteries (4-pack)',
'Bose SoundSport Headphones', 'AAA Batteries (4-pack)',
'ThinkPad Laptop', 'Lightning Charging Cable', 'Google Phone',
'Wired Headphones', 'Apple Airpods Headphones', 'Vareebadd Phone',
'iPhone', '20in Monitor', '34in Ultrawide Monitor',
'Flatscreen TV', '27in 4K Gaming Monitor', 'LG Dryer'],
dtype=object)
Products= sd.groupby('Product')['Quantity Ordered'].sum().sort_values()
Products
Product LG Dryer 646 LG Washing Machine 666 Vareebadd Phone 2068 20in Monitor 4129 ThinkPad Laptop 4130 Macbook Pro Laptop 4728 Flatscreen TV 4819 Google Phone 5532 34in Ultrawide Monitor 6199 27in 4K Gaming Monitor 6244 iPhone 6849 27in FHD Monitor 7550 Bose SoundSport Headphones 13457 Apple Airpods Headphones 15661 Wired Headphones 20557 Lightning Charging Cable 23217 USB-C Charging Cable 23975 AA Batteries (4-pack) 27635 AAA Batteries (4-pack) 31017 Name: Quantity Ordered, dtype: int64
sns.barplot(y=Products.index, x=Products.values).set(title='Quantity Ordered per Product')
plt.show()
sd['Year'] = pd.to_datetime(sd['Order Date']).dt.year
sd.head()
| Unnamed: 0 | Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 295665 | Macbook Pro Laptop | 1 | 1700.00 | 2019-12-30 00:01:00 | 136 Church St, New York City, NY 10001 | 12 | 1700.00 | New York City | 0 | 2019 |
| 1 | 1 | 295666 | LG Washing Machine | 1 | 600.00 | 2019-12-29 07:03:00 | 562 2nd St, New York City, NY 10001 | 12 | 600.00 | New York City | 7 | 2019 |
| 2 | 2 | 295667 | USB-C Charging Cable | 1 | 11.95 | 2019-12-12 18:21:00 | 277 Main St, New York City, NY 10001 | 12 | 11.95 | New York City | 18 | 2019 |
| 3 | 3 | 295668 | 27in FHD Monitor | 1 | 149.99 | 2019-12-22 15:13:00 | 410 6th St, San Francisco, CA 94016 | 12 | 149.99 | San Francisco | 15 | 2019 |
| 4 | 4 | 295669 | USB-C Charging Cable | 1 | 11.95 | 2019-12-18 12:38:00 | 43 Hill St, Atlanta, GA 30301 | 12 | 11.95 | Atlanta | 12 | 2019 |
month_df = sd.groupby(['Year','Month'])
Monthly_sales = pd.DataFrame(month_df['Sales'].sum(), columns = ['Sales'])
avg_order_month = month_df ['Sales'].mean()
avg_order_month = avg_order_month.reset_index()
Monthly_sales = Monthly_sales.reset_index()
Monthly_sales = Monthly_sales.loc[Monthly_sales['Year']==2019]
Monthly_sales['AOV'] = avg_order_month['Sales']
Monthly_sales.sort_values
<bound method DataFrame.sort_values of Year Month Sales AOV 0 2019 1 1813586.44 187.450795 1 2019 2 2202022.42 183.884962 2 2019 3 2807100.38 185.250471 3 2019 4 3390670.24 185.495390 4 2019 5 3152606.75 190.305852 5 2019 6 2577802.26 190.187565 6 2019 7 2647775.76 185.249826 7 2019 8 2244467.88 187.648849 8 2019 9 2097560.13 180.497387 9 2019 10 3736726.88 184.238580 10 2019 11 3199603.20 182.074956 11 2019 12 4613443.34 184.655913>
plt.figure(figsize=(10, 6))
sns.barplot(data=Monthly_sales, x='Month', y='Sales')
plt.title('Every Month Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.show()
fig = px.bar(data_frame=Monthly_sales, x='Month', y='Sales', title='Every Month Sales')
fig.show()
product_sales = sd.groupby(['Year','Product'])
product_sales = pd.DataFrame(product_sales["Sales"].sum(), columns=["Sales"])
product_sales = product_sales.reset_index()
product_sales
| Year | Product | Sales | |
|---|---|---|---|
| 0 | 2019 | 20in Monitor | 453818.74 |
| 1 | 2019 | 27in 4K Gaming Monitor | 2434707.57 |
| 2 | 2019 | 27in FHD Monitor | 1131974.53 |
| 3 | 2019 | 34in Ultrawide Monitor | 2355558.01 |
| 4 | 2019 | AA Batteries (4-pack) | 106118.40 |
| 5 | 2019 | AAA Batteries (4-pack) | 92725.88 |
| 6 | 2019 | Apple Airpods Headphones | 2348550.00 |
| 7 | 2019 | Bose SoundSport Headphones | 1345265.46 |
| 8 | 2019 | Flatscreen TV | 1445400.00 |
| 9 | 2019 | Google Phone | 3318600.00 |
| 10 | 2019 | LG Dryer | 387600.00 |
| 11 | 2019 | LG Washing Machine | 399600.00 |
| 12 | 2019 | Lightning Charging Cable | 347004.45 |
| 13 | 2019 | Macbook Pro Laptop | 8035900.00 |
| 14 | 2019 | ThinkPad Laptop | 4127958.72 |
| 15 | 2019 | USB-C Charging Cable | 286453.45 |
| 16 | 2019 | Vareebadd Phone | 826800.00 |
| 17 | 2019 | Wired Headphones | 246430.47 |
| 18 | 2019 | iPhone | 4792900.00 |
| 19 | 2020 | 20in Monitor | 329.97 |
| 20 | 2020 | 27in 4K Gaming Monitor | 389.99 |
| 21 | 2020 | 27in FHD Monitor | 449.97 |
| 22 | 2020 | AAA Batteries (4-pack) | 14.95 |
| 23 | 2020 | Apple Airpods Headphones | 600.00 |
| 24 | 2020 | Bose SoundSport Headphones | 299.97 |
| 25 | 2020 | Flatscreen TV | 300.00 |
| 26 | 2020 | Google Phone | 600.00 |
| 27 | 2020 | Lightning Charging Cable | 89.70 |
| 28 | 2020 | Macbook Pro Laptop | 1700.00 |
| 29 | 2020 | ThinkPad Laptop | 1999.98 |
| 30 | 2020 | USB-C Charging Cable | 47.80 |
| 31 | 2020 | Vareebadd Phone | 400.00 |
| 32 | 2020 | Wired Headphones | 47.96 |
| 33 | 2020 | iPhone | 1400.00 |
# Create a pie chart figure
fig = go.Figure(data=[go.Pie(labels=product_sales["Product"].loc[product_sales["Year"]==2019], values=product_sales["Sales"].loc[product_sales["Year"]==2019])])
# Set layout options
fig.update_layout(title='Popular Product for year 2019')
# Display the chart
fig.show()
Popular_Hour = sd.groupby(['Month','Hour'])
Popular_Hour = pd.DataFrame(Popular_Hour['Sales'].sum(), columns= ['Sales'])
Popular_Hour = Popular_Hour.reset_index()
Popular_Hour = Popular_Hour.loc[Popular_Hour["Month"]==12]
Popular_Hour
| Month | Hour | Sales | |
|---|---|---|---|
| 264 | 12 | 0 | 93795.21 |
| 265 | 12 | 1 | 63311.56 |
| 266 | 12 | 2 | 32250.44 |
| 267 | 12 | 3 | 16567.74 |
| 268 | 12 | 4 | 19626.14 |
| 269 | 12 | 5 | 33728.66 |
| 270 | 12 | 6 | 54690.49 |
| 271 | 12 | 7 | 120218.92 |
| 272 | 12 | 8 | 168953.52 |
| 273 | 12 | 9 | 217493.38 |
| 274 | 12 | 10 | 271169.35 |
| 275 | 12 | 11 | 300369.73 |
| 276 | 12 | 12 | 288275.69 |
| 277 | 12 | 13 | 294821.51 |
| 278 | 12 | 14 | 288622.89 |
| 279 | 12 | 15 | 269882.01 |
| 280 | 12 | 16 | 233328.75 |
| 281 | 12 | 17 | 290406.13 |
| 282 | 12 | 18 | 285893.25 |
| 283 | 12 | 19 | 323274.15 |
| 284 | 12 | 20 | 321235.30 |
| 285 | 12 | 21 | 255373.54 |
| 286 | 12 | 22 | 212241.78 |
| 287 | 12 | 23 | 157913.20 |
fig = go.Figure(data=go.Scatter(x=Popular_Hour["Hour"], y=Popular_Hour["Sales"]))
# Set layout options
fig.update_layout(title='Peak Hours in a Month')
# Display the chart
fig.show()
fig = px.bar(Popular_Hour, x='Hour', y='Sales', title=f'Peak Hours for Sales')
fig.show()
Area_Value = sd.groupby(['City'])
Area_Value = Area_Value['Sales'].sum().reset_index()
Area_Value
| City | Sales | |
|---|---|---|
| 0 | Atlanta | 2795498.58 |
| 1 | Austin | 1819581.75 |
| 2 | Boston | 3661642.01 |
| 3 | Dallas | 2767975.40 |
| 4 | Los Angeles | 5452570.80 |
| 5 | New York City | 4664317.43 |
| 6 | Portland | 2320490.61 |
| 7 | San Francisco | 8262203.91 |
| 8 | Seattle | 2747755.48 |
import plotly.express as px
import pandas as pd
# Plotting cities and population
fig = px.scatter(Area_Value, x='City', y='Sales', text='City',
title='Cities and Sales')
# Customize the plot layout
fig.update_layout(
xaxis_title='City',
yaxis_title='Sales',
hovermode='closest'
)
# Display the plot
fig.show()
m_sales=sd.groupby('Product')['Sales'].sum().nlargest(5).plot.bar(title='Best Product for Sales',figsize=(5,5))
m_sales
<Axes: title={'center': 'Best Product for Sales'}, xlabel='Product'>
m_sales=sd.groupby('Product')['Sales'].sum().nsmallest(5).plot.bar(title='Lowest Product for Sales',figsize=(5,5))
m_sales
<Axes: title={'center': 'Lowest Product for Sales'}, xlabel='Product'>
Top_City = sd.groupby('City')['Sales'].sum().nlargest(5)
Top_City
City San Francisco 8262203.91 Los Angeles 5452570.80 New York City 4664317.43 Boston 3661642.01 Atlanta 2795498.58 Name: Sales, dtype: float64
sns.barplot(y=Top_City.index,x=Top_City).set(title='Top Five City Sales')
[Text(0.5, 1.0, 'Top Five City Sales')]